date: 2018-10-29
R setup
knitr::opts_chunk$set(echo = TRUE, fig.width = 9, fig.height = 9)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(knitr)
library(readxl)
library(moments)
library(epiR)
## Warning: package 'epiR' was built under R version 3.4.4
## Loading required package: survival
## Package epiR 0.9-97 is loaded
## Type help(epi.about) for summary information
##
library(e1071)
##
## Attaching package: 'e1071'
## The following objects are masked from 'package:moments':
##
## kurtosis, moment, skewness
library(lsmeans)
## Warning: package 'lsmeans' was built under R version 3.4.4
## The 'lsmeans' package is being deprecated.
## Users are encouraged to switch to 'emmeans'.
## See help('transition') for more information, including how
## to convert 'lsmeans' objects and scripts to work with 'emmeans'.
library(lmSupport)
## Warning: package 'lmSupport' was built under R version 3.4.4
library(ppcor)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(tableone)
## Warning: package 'tableone' was built under R version 3.4.4
library(sjPlot)
## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.2.12
## Current Matrix version is 1.2.13
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(car)
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(leaps)
Load data
rm(list = ls())
## define directory
analysisDir <- '/Users/matthewmoll/Documents/Fellowship/MPH/Fall2018/BST213_regression/homework/'
## Read in data
cost <- read_excel("/Users/matthewmoll/Documents/Fellowship/MPH/Fall2018/BST213_regression/homework/cost.xls")
Data structure
head(cost)
## # A tibble: 6 x 14
## outcosts2 ptage ssi panic anxiety married female educat iadl social
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 83. 38.9 1.00 0. 0. 1. 1. 5. 100. 100.
## 2 2162. 53.3 1.00 0. 0. 0. 1. 2. 91.7 100.
## 3 2203. 42.7 1.08 0. 0. 0. 1. 1. 75.0 100.
## 4 0. 70.2 1.15 0. 0. 0. 1. 1. 46.7 100.
## 5 4050. 41.1 1.00 0. 0. 1. 1. 3. 100. 100.
## 6 815. 38.4 1.15 0. 0. 0. 1. 2. 83.3 100.
## # ... with 4 more variables: racecat <dbl>, whiteleycat <dbl>,
## # charlson <dbl>, depcat <dbl>
str(cost)
## Classes 'tbl_df', 'tbl' and 'data.frame': 461 obs. of 14 variables:
## $ outcosts2 : num 83 2162 2203 0 4050 ...
## $ ptage : num 38.9 53.3 42.7 70.2 41.1 ...
## $ ssi : num 1 1 1.08 1.15 1 ...
## $ panic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anxiety : num 0 0 0 0 0 0 0 0 0 0 ...
## $ married : num 1 0 0 0 1 0 1 0 0 0 ...
## $ female : num 1 1 1 1 1 1 0 1 0 0 ...
## $ educat : num 5 2 1 1 3 2 5 3 2 2 ...
## $ iadl : num 100 91.7 75 46.7 100 ...
## $ social : num 100 100 100 100 100 ...
## $ racecat : num 4 2 2 1 1 2 4 2 1 1 ...
## $ whiteleycat: num 1 1 1 1 1 1 1 1 1 1 ...
## $ charlson : num 0 1 1 2 0 3 0 2 0 1 ...
## $ depcat : num 3 3 3 3 3 3 3 3 3 3 ...
sapply(cost, summary)
## $outcosts2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 307 1016 1836 2463 11909
##
## $ptage
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.96 32.24 44.62 45.16 55.21 89.68 1
##
## $ssi
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.231 1.538 1.738 2.077 5.000 6
##
## $panic
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06291 0.00000 1.00000
##
## $anxiety
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06508 0.00000 1.00000
##
## $married
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.4096 1.0000 1.0000 2
##
## $female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 0.744 1.000 1.000
##
## $educat
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 2.000 3.000 3.359 5.000 5.000 1
##
## $iadl
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 72.22 94.44 81.82 100.00 100.00
##
## $social
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 100.00 100.00 89.86 100.00 100.00 2
##
## $racecat
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.297 4.000 4.000
##
## $whiteleycat
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.456 2.000 3.000
##
## $charlson
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.5662 1.0000 7.0000
##
## $depcat
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 3.00 3.00 2.72 3.00 3.00
Define variables
The goal for this assignment is to build a linear regression model using the priniciples discussed in class, and then write it into a methods and results section of a manuscript.
removevars <- c()
## Define response variables
responseVars <- c('outcosts2')
responseVars
## [1] "outcosts2"
## Define explanatory variables
explanVars <- names(cost)[!names(cost) %in% c("outcosts2")]
explanVars
## [1] "ptage" "ssi" "panic" "anxiety" "married"
## [6] "female" "educat" "iadl" "social" "racecat"
## [11] "whiteleycat" "charlson" "depcat"
## List of quantitative variables
quantVars <- names(cost[!names(cost) %in% removevars &
sapply(cost, function(x)
length(levels(as.factor(x)))>7)])
quantVars
## [1] "outcosts2" "ptage" "ssi" "iadl" "social" "charlson"
## List of binary variables
binVars <- names(cost[!names(cost) %in% removevars &
sapply(cost, function(x)
length(levels(as.factor(x)))==2)])
binVars
## [1] "panic" "anxiety" "married" "female"
## List of categorical variables
catVars <- names(cost[!names(cost) %in% c(removevars,binVars) &
sapply(cost, function(x)
length(levels(as.factor(x)))<=7 & length(levels(as.factor(x)))>1)])
catVars
## [1] "educat" "racecat" "whiteleycat" "depcat"
## Get Intersection between quantVars and explanVars and responseVars
explanVars.quant <- intersect(explanVars,quantVars)
responseVars.quant <- intersect(responseVars,quantVars)
## Get intersection between categorical and binary variables with explanatory and response vars
explanVars.cat <- intersect(explanVars,catVars)
explanVars.cat <- explanVars.cat[!explanVars.cat %in% c(binVars)]
responseVars.cat <- intersect(responseVars, catVars)
responseVars.cat <- responseVars.cat[!responseVars.cat %in% c(binVars)]
## Get binary explanatory and response variables
explanVars.bin <- intersect(explanVars, binVars)
responseVars.bin <- intersect(responseVars, binVars)
## View data ranges
str(cost)
## Classes 'tbl_df', 'tbl' and 'data.frame': 461 obs. of 14 variables:
## $ outcosts2 : num 83 2162 2203 0 4050 ...
## $ ptage : num 38.9 53.3 42.7 70.2 41.1 ...
## $ ssi : num 1 1 1.08 1.15 1 ...
## $ panic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anxiety : num 0 0 0 0 0 0 0 0 0 0 ...
## $ married : num 1 0 0 0 1 0 1 0 0 0 ...
## $ female : num 1 1 1 1 1 1 0 1 0 0 ...
## $ educat : num 5 2 1 1 3 2 5 3 2 2 ...
## $ iadl : num 100 91.7 75 46.7 100 ...
## $ social : num 100 100 100 100 100 ...
## $ racecat : num 4 2 2 1 1 2 4 2 1 1 ...
## $ whiteleycat: num 1 1 1 1 1 1 1 1 1 1 ...
## $ charlson : num 0 1 1 2 0 3 0 2 0 1 ...
## $ depcat : num 3 3 3 3 3 3 3 3 3 3 ...
sapply(cost, summary)
## $outcosts2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 307 1016 1836 2463 11909
##
## $ptage
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.96 32.24 44.62 45.16 55.21 89.68 1
##
## $ssi
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.231 1.538 1.738 2.077 5.000 6
##
## $panic
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06291 0.00000 1.00000
##
## $anxiety
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06508 0.00000 1.00000
##
## $married
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.4096 1.0000 1.0000 2
##
## $female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 0.744 1.000 1.000
##
## $educat
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 2.000 3.000 3.359 5.000 5.000 1
##
## $iadl
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 72.22 94.44 81.82 100.00 100.00
##
## $social
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 100.00 100.00 89.86 100.00 100.00 2
##
## $racecat
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.297 4.000 4.000
##
## $whiteleycat
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.456 2.000 3.000
##
## $charlson
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.5662 1.0000 7.0000
##
## $depcat
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 3.00 3.00 2.72 3.00 3.00
## turn binary and category variables into factors
varsToFactor <- c(catVars, binVars)
cost[,varsToFactor] <- sapply(cost[,varsToFactor],as.factor)
Normality assessment
cost_quant <- as.data.frame(cost %>% dplyr::select(quantVars))
normalityfun <- function(dataset) {
require(moments)
require(e1071)
print(paste0("Summary Statistics: ", names(dataset)[i]))
print(paste0("Mean: ", mean(as.numeric(dataset[,i]), na.rm = T)))
print(paste0("Standard Deviation: ",sd(as.numeric(dataset[,i]), na.rm = T)))
print(paste0("Median: ", median(as.numeric(dataset[,i]), na.rm = T)))
print(paste0("Skewness: ", skewness(as.numeric(dataset[,i]), na.rm = T)))
print(paste0("Kurtosis: ", e1071::kurtosis(as.numeric(dataset[,i]), type = 2)))
print(paste0("Shapiro-Wilk Test: ", shapiro.test(as.numeric(dataset[,i]))$p.value))
print(" ")
print(" ")
qqnorm(as.numeric(dataset[,i]), main = paste0("Normal Q-Q plot: ",names(dataset)[i]))
qqline(as.numeric(dataset[,i]), col = 2)
}
for(i in 1:length(quantVars)) {
print(ggplot() +
geom_histogram(data = cost_quant, fill = "blue", alpha = 0.5, mapping =
aes(x = get(quantVars[i]))) +
xlab(quantVars[i]) + labs(title = "Histogram")
)
normalityfun(cost_quant)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Summary Statistics: outcosts2"
## [1] "Mean: 1836.48373101952"
## [1] "Standard Deviation: 2195.4502208015"
## [1] "Median: 1016"
## [1] "Skewness: 1.86911992818483"
## [1] "Kurtosis: 3.71673113110437"
## [1] "Shapiro-Wilk Test: 1.32694421506979e-24"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).

## [1] "Summary Statistics: ptage"
## [1] "Mean: 45.1636818141237"
## [1] "Standard Deviation: 15.4858207820523"
## [1] "Median: 44.621492128679"
## [1] "Skewness: 0.422861613908007"
## [1] "Kurtosis: NA"
## [1] "Shapiro-Wilk Test: 9.58087935521775e-08"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).

## [1] "Summary Statistics: ssi"
## [1] "Mean: 1.73845513460898"
## [1] "Standard Deviation: 0.693705411185324"
## [1] "Median: 1.53846153846154"
## [1] "Skewness: 1.35710574430674"
## [1] "Kurtosis: NA"
## [1] "Shapiro-Wilk Test: 1.83131200837982e-19"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Summary Statistics: iadl"
## [1] "Mean: 81.8185104844541"
## [1] "Standard Deviation: 25.5994105151121"
## [1] "Median: 94.4444444444444"
## [1] "Skewness: -1.54182530496445"
## [1] "Kurtosis: 1.54732926242677"
## [1] "Shapiro-Wilk Test: 3.82337675082766e-26"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).

## [1] "Summary Statistics: social"
## [1] "Mean: 89.8571774388768"
## [1] "Standard Deviation: 22.5952089828678"
## [1] "Median: 100"
## [1] "Skewness: -2.50691098205916"
## [1] "Kurtosis: NA"
## [1] "Shapiro-Wilk Test: 1.85402296574893e-33"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Summary Statistics: charlson"
## [1] "Mean: 0.566160520607375"
## [1] "Standard Deviation: 1.07464059120437"
## [1] "Median: 0"
## [1] "Skewness: 2.57120849300461"
## [1] "Kurtosis: 8.03575657287451"
## [1] "Shapiro-Wilk Test: 1.66716672242903e-31"
## [1] " "
## [1] " "

The only variable that looks close to normal is patient age, which we may be able to use parametric testing by invoking the central limit theorem. Otherwise, we will report median and interquartile ranges.
Missingness
## assess for and remove > 10% missingness
## Set threshold
thr <- 0.1*length(cost$outcosts2)
sapply(cost, function(x) ifelse(sum(is.na(x)) > thr, 1, 0))
## outcosts2 ptage ssi panic anxiety married
## 0 0 0 0 0 0
## female educat iadl social racecat whiteleycat
## 0 0 0 0 0 0
## charlson depcat
## 0 0
sapply(cost,function(x) sum(is.na(x)))
## outcosts2 ptage ssi panic anxiety married
## 0 1 6 0 0 2
## female educat iadl social racecat whiteleycat
## 0 1 0 2 0 0
## charlson depcat
## 0 0
## Create complete dataset of phenotypic variables
cost <- na.omit(cost)
No variables were removed for missingness. However, ssi has the greatest missingness.
Table 1: Demographics
## Add labels to table
labelled::var_label(cost$outcosts2) <- "Outpatient costs ($)"
labelled::var_label(cost$ptage) <- "Age in years"
labelled::var_label(cost$ssi) <- "Somatic Symptom Index (No. (%))"
labelled::var_label(cost$panic) <- "Panic (No. (%))"
labelled::var_label(cost$anxiety) <- "Anxiety (No. (%))"
labelled::var_label(cost$married) <- "Married (No. (%))"
labelled::var_label(cost$female) <- "Female (No. (%))"
labelled::var_label(cost$educat) <- "Education Level (No. (%))"
labelled::var_label(cost$iadl) <- "IADL score"
labelled::var_label(cost$social) <- "Social Activities Score"
labelled::var_label(cost$racecat) <- "Race (No. (%))"
labelled::var_label(cost$whiteleycat) <- "Whiteley Category"
labelled::var_label(cost$charlson) <- "Charlson index"
labelled::var_label(cost$depcat) <- "Depression Category (No. (%))"
## Group variables
demovars <- names(cost)
demovars
## [1] "outcosts2" "ptage" "ssi" "panic" "anxiety"
## [6] "married" "female" "educat" "iadl" "social"
## [11] "racecat" "whiteleycat" "charlson" "depcat"
varsToFactor <- c(catVars, binVars)
nonNormalVars <- names(cost_quant)[!names(cost_quant) %in% c("ptage")]
## Table one
tableOne <- CreateTableOne(vars = demovars, data = cost, factorVars = varsToFactor, testNonNormal = nonNormalVars)
tableOne
##
## Overall
## n 449
## outcosts2 (mean (sd)) 1829.86 (2205.83)
## ptage (mean (sd)) 45.22 (15.55)
## ssi (mean (sd)) 1.74 (0.69)
## panic = 1 (%) 28 ( 6.2)
## anxiety = 1 (%) 28 ( 6.2)
## married = 1 (%) 184 (41.0)
## female = 1 (%) 332 (73.9)
## educat (%)
## 1 39 ( 8.7)
## 2 100 (22.3)
## 3 91 (20.3)
## 4 94 (20.9)
## 5 125 (27.8)
## iadl (mean (sd)) 81.80 (25.72)
## social (mean (sd)) 89.73 (22.78)
## racecat (%)
## 1 174 (38.8)
## 2 97 (21.6)
## 3 48 (10.7)
## 4 130 (29.0)
## whiteleycat (%)
## 1 297 (66.1)
## 2 100 (22.3)
## 3 52 (11.6)
## charlson (mean (sd)) 0.57 (1.07)
## depcat (%)
## 1 45 (10.0)
## 2 36 ( 8.0)
## 3 368 (82.0)
## Save as csv and word document
tableOne_out <- print(tableOne,quote = FALSE, noSpaces = TRUE, printToggle = FALSE, varLabel = TRUE, nonnormal = nonNormalVars)
write.csv(tableOne_out, file = paste0(analysisDir,"TableOne.csv"))
tableOne_out <- fread(paste0(analysisDir, "TableOne.csv"))
colnames(tableOne_out) <- c("","")
tab_df(tableOne_out, file = paste0(analysisDir, "TableOne.doc"))
|
|
|
|
n
|
449
|
|
Outpatient costs ($) (median [IQR])
|
998.00 [305.00, 2463.00]
|
|
Age in years (mean (sd))
|
45.22 (15.55)
|
|
Somatic Symptom Index (No. (%)) (median [IQR])
|
1.54 [1.23, 2.08]
|
|
Panic (No. (%)) = 1 (%)
|
28 (6.2)
|
|
Anxiety (No. (%)) = 1 (%)
|
28 (6.2)
|
|
Married (No. (%)) = 1 (%)
|
184 (41.0)
|
|
Female (No. (%)) = 1 (%)
|
332 (73.9)
|
|
Education Level (No. (%)) (%)
|
|
|
1
|
39 (8.7)
|
|
2
|
100 (22.3)
|
|
3
|
91 (20.3)
|
|
4
|
94 (20.9)
|
|
5
|
125 (27.8)
|
|
IADL score (median [IQR])
|
94.44 [72.22, 100.00]
|
|
Social Activities Score (median [IQR])
|
100.00 [100.00, 100.00]
|
|
Race (No. (%)) (%)
|
|
|
1
|
174 (38.8)
|
|
2
|
97 (21.6)
|
|
3
|
48 (10.7)
|
|
4
|
130 (29.0)
|
|
Whiteley Category (%)
|
|
|
1
|
297 (66.1)
|
|
2
|
100 (22.3)
|
|
3
|
52 (11.6)
|
|
Charlson index (median [IQR])
|
0.00 [0.00, 1.00]
|
|
Depression Category (No. (%)) (%)
|
|
|
1
|
45 (10.0)
|
|
2
|
36 (8.0)
|
|
3
|
368 (82.0)
|
Scatterplots and correlation coefficients
## Scatterplots and correlation coefficients
for (i in 1:length(explanVars.quant)) {
for (j in 1:length(responseVars.quant)) {
print(paste0("Explanatory var: ", explanVars.quant[i]))
print(paste0("Response var: ", responseVars.quant[j]))
cormod <- cor.test(as.data.frame(cost)[,explanVars.quant[i]],
as.data.frame(cost)[,responseVars.quant[j]],
method = "spearman")
print(paste0("Spearman Correlation Coefficient: ", signif(cormod$estimate,5)))
print(paste0("p-value: ", signif(cormod$p.value,5)))
print(" ")
print(" ")
print(ggplot(data = cost) +
geom_point(mapping = aes(x = get(explanVars.quant[i]),
y = get(responseVars.quant[j]))) +
geom_smooth(mapping = aes(x = get(explanVars.quant[i]),
y = get(responseVars.quant[j])), method = 'lm') +
xlab(explanVars.quant[i]) + ylab(responseVars.quant[j]) +
labs(subtitle =
paste0(paste0("r-value: ", signif(cormod$estimate,5)),", ",
paste0("p-value: ", signif(cormod$p.value,5)))))
## plot residuals
print(ggplot(data = lm(get(responseVars.quant[j])~get(explanVars.quant[i]), data = cost)) +
geom_point(aes(x = .fitted, y = .resid)) +
labs(title = paste0("Residuals vs fitted: ",explanVars.quant[i]))) # fitted vs residuals
print(ggplot(data = lm(get(responseVars.quant[j])~get(explanVars.quant[i]), data = cost)) +
geom_histogram(aes(x = .resid)) +
labs(title = paste0("Residuals histogram: ",explanVars.quant[i])))
}
}
## [1] "Explanatory var: ptage"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties
## [1] "Spearman Correlation Coefficient: 0.12543"
## [1] "p-value: 0.0077943"
## [1] " "
## [1] " "


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: ssi"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.28145"
## [1] "p-value: 1.2772e-09"
## [1] " "
## [1] " "


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: iadl"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.28202"
## [1] "p-value: 1.1791e-09"
## [1] " "
## [1] " "


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: social"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.16497"
## [1] "p-value: 0.00044822"
## [1] " "
## [1] " "


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: charlson"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.32663"
## [1] "p-value: 1.2696e-12"
## [1] " "
## [1] " "


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

All vars: Scatterplots and correlation coefficients
## Scatterplots and correlation coefficients
for (i in 1:length(explanVars.quant)) {
for (j in 1:length(responseVars.quant)) {
print(paste0("Explanatory var: ", explanVars.quant[i]))
print(paste0("Response var: ", responseVars.quant[j]))
cormod <- cor.test(as.data.frame(cost)[,explanVars.quant[i]],
as.data.frame(cost)[,responseVars.quant[j]],
method = "spearman")
print(paste0("Spearman Correlation Coefficient: ", signif(cormod$estimate,5)))
print(paste0("p-value: ", signif(cormod$p.value,5)))
print(" ")
print(" ")
print(ggplot(data = cost) +
geom_point(mapping = aes(x = get(explanVars.quant[i]),
y = get(responseVars.quant[j]))) +
geom_smooth(mapping = aes(x = get(explanVars.quant[i]),
y = get(responseVars.quant[j])), method = 'lm') +
xlab(explanVars.quant[i]) + ylab(responseVars.quant[j]) +
labs(subtitle =
paste0(paste0("r-value: ", signif(cormod$estimate,5)),", ",
paste0("p-value: ", signif(cormod$p.value,5)))))
## plot residuals
print(ggplot(data = lm(get(responseVars.quant[j])~get(explanVars.quant[i]), data = cost)) +
geom_point(aes(x = .fitted, y = .resid)) +
labs(title = paste0("Residuals vs fitted: ",explanVars.quant[i]))) # fitted vs residuals
print(ggplot(data = lm(get(responseVars.quant[j])~get(explanVars.quant[i]), data = cost)) +
geom_histogram(aes(x = .resid)) +
labs(title = paste0("Residuals histogram: ",explanVars.quant[i])))
}
}
## [1] "Explanatory var: ptage"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties
## [1] "Spearman Correlation Coefficient: 0.12543"
## [1] "p-value: 0.0077943"
## [1] " "
## [1] " "


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: ssi"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.28145"
## [1] "p-value: 1.2772e-09"
## [1] " "
## [1] " "


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: iadl"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.28202"
## [1] "p-value: 1.1791e-09"
## [1] " "
## [1] " "


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: social"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: -0.16497"
## [1] "p-value: 0.00044822"
## [1] " "
## [1] " "


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Explanatory var: charlson"
## [1] "Response var: outcosts2"
## Warning in cor.test.default(as.data.frame(cost)[, explanVars.quant[i]], :
## Cannot compute exact p-value with ties

## [1] "Spearman Correlation Coefficient: 0.32663"
## [1] "p-value: 1.2696e-12"
## [1] " "
## [1] " "


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Variable selection
#
# FS <- regsubsets(as.formula(LinEq), data = cost, method = "forward", force.in = c("depcat","charlson"))
#
# str(summary(FS))
# str(FS)
#
# summary(FS)$rss
set.seed(123)
null <- lm(outcosts2~1, data = cost)
full <- lm(outcosts2~., data = cost)
FS <- step(null, scope=list(lower=null, upper=full), direction="forward")
## Start: AIC=6914.57
## outcosts2 ~ 1
##
## Df Sum of Sq RSS AIC
## + charlson 1 240985083 1938832720 6864.0
## + racecat 3 214228353 1965589450 6874.1
## + iadl 1 163770786 2016047017 6881.5
## + depcat 2 158562936 2021254867 6884.7
## + ssi 1 133484030 2046333773 6888.2
## + educat 4 127715950 2052101853 6895.5
## + whiteleycat 2 96805662 2083012142 6898.2
## + social 1 81518772 2098299031 6899.5
## + panic 1 34614794 2145203009 6909.4
## + anxiety 1 24581357 2155236447 6911.5
## + ptage 1 20088860 2159728943 6912.4
## + female 1 13917217 2165900586 6913.7
## + married 1 13095811 2166721993 6913.9
## <none> 2179817803 6914.6
##
## Step: AIC=6863.97
## outcosts2 ~ charlson
##
## Df Sum of Sq RSS AIC
## + racecat 3 152961582 1785871138 6833.1
## + depcat 2 110921296 1827911424 6841.5
## + ssi 1 85455902 1853376818 6845.7
## + iadl 1 82639449 1856193271 6846.4
## + whiteleycat 2 65028251 1873804469 6852.7
## + educat 4 78602638 1860230082 6853.4
## + social 1 45657600 1893175121 6855.3
## + anxiety 1 23073440 1915759280 6860.6
## + panic 1 20666597 1918166124 6861.2
## + female 1 19857339 1918975382 6861.3
## + married 1 18252581 1920580139 6861.7
## <none> 1938832720 6864.0
## + ptage 1 185287 1938647434 6865.9
##
## Step: AIC=6833.07
## outcosts2 ~ charlson + racecat
##
## Df Sum of Sq RSS AIC
## + depcat 2 66381509 1719489629 6820.1
## + ssi 1 44909757 1740961381 6823.6
## + iadl 1 33904439 1751966699 6826.5
## + whiteleycat 2 31551428 1754319710 6829.1
## + social 1 22803842 1763067296 6829.3
## + anxiety 1 18519335 1767351803 6830.4
## + panic 1 12978761 1772892377 6831.8
## + female 1 11621963 1774249175 6832.1
## <none> 1785871138 6833.1
## + married 1 4509966 1781361172 6833.9
## + ptage 1 265857 1785605281 6835.0
## + educat 4 20904789 1764966349 6835.8
##
## Step: AIC=6820.06
## outcosts2 ~ charlson + racecat + depcat
##
## Df Sum of Sq RSS AIC
## + ssi 1 21043687 1698445942 6816.5
## + iadl 1 18705072 1700784557 6817.2
## + female 1 8780270 1710709359 6819.8
## <none> 1719489629 6820.1
## + social 1 6794491 1712695138 6820.3
## + anxiety 1 5224667 1714264962 6820.7
## + panic 1 3947281 1715542348 6821.0
## + married 1 2954262 1716535366 6821.3
## + whiteleycat 2 8535564 1710954064 6821.8
## + ptage 1 23159 1719466469 6822.1
## + educat 4 17188031 1702301597 6823.6
##
## Step: AIC=6816.53
## outcosts2 ~ charlson + racecat + depcat + ssi
##
## Df Sum of Sq RSS AIC
## + female 1 8252837 1690193105 6816.3
## <none> 1698445942 6816.5
## + iadl 1 5552772 1692893170 6817.1
## + anxiety 1 2362417 1696083525 6817.9
## + married 1 2029243 1696416699 6818.0
## + panic 1 1062844 1697383098 6818.3
## + social 1 693520 1697752422 6818.4
## + ptage 1 513823 1697932119 6818.4
## + whiteleycat 2 4485912 1693960030 6819.3
## + educat 4 16018581 1682427361 6820.3
##
## Step: AIC=6816.35
## outcosts2 ~ charlson + racecat + depcat + ssi + female
##
## Df Sum of Sq RSS AIC
## <none> 1690193105 6816.3
## + iadl 1 5186509 1685006596 6817.0
## + anxiety 1 2842442 1687350663 6817.6
## + panic 1 944084 1689249021 6818.1
## + married 1 656885 1689536220 6818.2
## + social 1 579968 1689613137 6818.2
## + ptage 1 215038 1689978067 6818.3
## + educat 4 19514216 1670678889 6819.1
## + whiteleycat 2 4182099 1686011007 6819.2
## View best model
summary(FS)
##
## Call:
## lm(formula = outcosts2 ~ charlson + racecat + depcat + ssi +
## female, data = cost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3808.1 -1200.9 -462.7 635.5 9036.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1054.88 516.32 2.043 0.0416 *
## charlson 544.88 88.82 6.135 1.90e-09 ***
## racecat2 272.10 253.73 1.072 0.2841
## racecat3 -16.91 324.81 -0.052 0.9585
## racecat4 -920.45 232.02 -3.967 8.49e-05 ***
## depcat2 909.12 444.18 2.047 0.0413 *
## depcat3 -282.31 339.11 -0.833 0.4056
## ssi 347.03 150.16 2.311 0.0213 *
## female1 313.07 213.59 1.466 0.1434
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1960 on 440 degrees of freedom
## Multiple R-squared: 0.2246, Adjusted R-squared: 0.2105
## F-statistic: 15.93 on 8 and 440 DF, p-value: < 2.2e-16
## Create vector of selected variables
selectedVars <- rownames(summary(FS)$coefficients)
selectedVars <- selectedVars[!selectedVars %in% c("(Intercept)")]
selectedVars
## [1] "charlson" "racecat2" "racecat3" "racecat4" "depcat2" "depcat3"
## [7] "ssi" "female1"
summary(FS)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1054.88163 516.32254 2.0430672 4.164106e-02
## charlson 544.87679 88.81502 6.1349624 1.900093e-09
## racecat2 272.09682 253.73241 1.0723771 2.841385e-01
## racecat3 -16.90764 324.81400 -0.0520533 9.585098e-01
## racecat4 -920.44524 232.01796 -3.9671292 8.491096e-05
## depcat2 909.11563 444.18219 2.0467179 4.127942e-02
## depcat3 -282.31156 339.10590 -0.8325174 4.055686e-01
## ssi 347.02618 150.16027 2.3110386 2.129213e-02
## female1 313.07143 213.59134 1.4657497 1.434307e-01
## do by p-value
# OLS.Mod <- paste0("outcosts2~depcat+charlson+",paste0(names(cost)[!names(cost) %in% c("charlson","depcat","outcosts2")], collapse = "+"))
# OLS <- ols(as.formula(OLS.Mod), data = cost)
#
# FS.p <- fastbw(OLS, rule = "p", sls = 0.05, force = 4)
Multicollinearity: variance inflation factors
selectedVars <- unique(tstrsplit(selectedVars, "(?=[A-Za-z])(?<=[0-9])|(?=[0-9])(?<=[A-Za-z])", perl=TRUE)[[1]])
LinEq <- paste0("outcosts2~",paste0(selectedVars, collapse = "+"))
LinEq
## [1] "outcosts2~charlson+racecat+depcat+ssi+female"
LinMod <- lm(as.formula(LinEq), data = cost)
summary(LinMod)
##
## Call:
## lm(formula = as.formula(LinEq), data = cost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3808.1 -1200.9 -462.7 635.5 9036.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1054.88 516.32 2.043 0.0416 *
## charlson 544.88 88.82 6.135 1.90e-09 ***
## racecat2 272.10 253.73 1.072 0.2841
## racecat3 -16.91 324.81 -0.052 0.9585
## racecat4 -920.45 232.02 -3.967 8.49e-05 ***
## depcat2 909.12 444.18 2.047 0.0413 *
## depcat3 -282.31 339.11 -0.833 0.4056
## ssi 347.03 150.16 2.311 0.0213 *
## female1 313.07 213.59 1.466 0.1434
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1960 on 440 degrees of freedom
## Multiple R-squared: 0.2246, Adjusted R-squared: 0.2105
## F-statistic: 15.93 on 8 and 440 DF, p-value: < 2.2e-16
vif(LinMod)
## GVIF Df GVIF^(1/(2*Df))
## charlson 1.060233 1 1.029676
## racecat 1.154487 3 1.024232
## depcat 1.306560 2 1.069135
## ssi 1.268000 1 1.126055
## female 1.027451 1 1.013633
## Remove if VIF > 10 - none in this case
## Now create table for adjusted coefficients
coefTable <-
data.frame(
variable =
rownames(summary(LinMod)$coefficients)[2:length(summary(LinMod)$coefficients[,1])],
coefficient =
as.numeric(summary(LinMod)$coefficients[2:length(summary(LinMod)$coefficients[,1]),1]),
LCI =
as.numeric(confint(LinMod)[2:length(summary(LinMod)$coefficients[,1]),1]),
UCI = as.numeric(confint(LinMod)[2:length(summary(LinMod)$coefficients[,1]),2]),
pvalue =
signif(as.numeric(summary(LinMod)$coefficients[2:length(summary(LinMod)$coefficients[,1]),4]),9))
kable(coefTable)
| charlson |
544.87679 |
370.32241 |
719.4312 |
0.0000000 |
| racecat2 |
272.09682 |
-226.58128 |
770.7749 |
0.2841385 |
| racecat3 |
-16.90764 |
-655.28737 |
621.4721 |
0.9585098 |
| racecat4 |
-920.44524 |
-1376.44641 |
-464.4441 |
0.0000849 |
| depcat2 |
909.11563 |
36.13322 |
1782.0980 |
0.0412794 |
| depcat3 |
-282.31156 |
-948.78016 |
384.1570 |
0.4055686 |
| ssi |
347.02618 |
51.90568 |
642.1467 |
0.0212921 |
| female1 |
313.07143 |
-106.71460 |
732.8575 |
0.1434307 |
## Now remove SSI which is likely an intermediary variable
LinEq.int <- paste0("outcosts2~",paste0(selectedVars[!selectedVars %in% c("ssi")], collapse = "+"))
LinEq.int
## [1] "outcosts2~charlson+racecat+depcat+female"
LinMod.int <- lm(as.formula(LinEq), data = cost)
summary(LinMod)
##
## Call:
## lm(formula = as.formula(LinEq), data = cost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3808.1 -1200.9 -462.7 635.5 9036.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1054.88 516.32 2.043 0.0416 *
## charlson 544.88 88.82 6.135 1.90e-09 ***
## racecat2 272.10 253.73 1.072 0.2841
## racecat3 -16.91 324.81 -0.052 0.9585
## racecat4 -920.45 232.02 -3.967 8.49e-05 ***
## depcat2 909.12 444.18 2.047 0.0413 *
## depcat3 -282.31 339.11 -0.833 0.4056
## ssi 347.03 150.16 2.311 0.0213 *
## female1 313.07 213.59 1.466 0.1434
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1960 on 440 degrees of freedom
## Multiple R-squared: 0.2246, Adjusted R-squared: 0.2105
## F-statistic: 15.93 on 8 and 440 DF, p-value: < 2.2e-16
vif(LinMod.int)
## GVIF Df GVIF^(1/(2*Df))
## charlson 1.060233 1 1.029676
## racecat 1.154487 3 1.024232
## depcat 1.306560 2 1.069135
## ssi 1.268000 1 1.126055
## female 1.027451 1 1.013633
Univariate OLS
coefTable.uni <- data.frame(variable = character(),coefficient = numeric(), LCI = numeric(), UCI = numeric(), pvalue = numeric())
for(i in 1:length(selectedVars)) {
LinEq.uni <- paste0("outcosts2~",selectedVars[i])
LinEq.uni
LinMod.uni <- lm(as.formula(LinEq.uni), data = cost)
coefTable.uni <- rbind(coefTable.uni, data.frame(variable = selectedVars[i],
coefficient = as.numeric(summary(LinMod.uni)$coefficients[2,1]),
LCI = confint(LinMod.uni)[2,1],
UCI = confint(LinMod.uni)[2,2],
pvalue = signif(summary(LinMod.uni)$coefficients[2,4], 9)
))
}
kable(coefTable.uni)
| charlson |
683.1861 |
503.056181 |
863.3161 |
0.0000000 |
| racecat |
524.9870 |
1.601816 |
1048.3722 |
0.0493059 |
| depcat |
938.2778 |
2.750733 |
1873.8048 |
0.0493330 |
| ssi |
786.0828 |
499.985526 |
1072.1800 |
0.0000001 |
| female |
401.0859 |
-64.020461 |
866.1923 |
0.0908156 |
Table 2: coefficients
table2 <- coefTable
# bind_cols(coefTable.uni,coefTable)
# table2 <- as.data.frame(sapply(table2 %>% dplyr::select(-variable,-variable1), function(x) signif(x, 5)))
kable(table2)
| charlson |
544.87679 |
370.32241 |
719.4312 |
0.0000000 |
| racecat2 |
272.09682 |
-226.58128 |
770.7749 |
0.2841385 |
| racecat3 |
-16.90764 |
-655.28737 |
621.4721 |
0.9585098 |
| racecat4 |
-920.44524 |
-1376.44641 |
-464.4441 |
0.0000849 |
| depcat2 |
909.11563 |
36.13322 |
1782.0980 |
0.0412794 |
| depcat3 |
-282.31156 |
-948.78016 |
384.1570 |
0.4055686 |
| ssi |
347.02618 |
51.90568 |
642.1467 |
0.0212921 |
| female1 |
313.07143 |
-106.71460 |
732.8575 |
0.1434307 |
## if p < 0.0001
table2[,c("pvalue")] <- sapply(table2[,c("pvalue")], function(x) ifelse(x < 0.0001, "< 0.0001",signif(x,5)))
## Create vector rownames and extract labels
# variableNames <- selectedVars
#
# variableNameLabels <- vector(length = length(variableNames))
#
# for(y in 1:length(variableNameLabels)) {
#
# variableNameLabels[y] <- labelled::var_label(cost[,variableNames[y]])
# }
## Collapse 95% CIs into cell with hazard ratios
#
# table2 <- table2 %>% mutate(variable = variableNameLabels,
# Coef1 = paste0(coefficient," (",LCI,",",UCI,")"),
# Coef2 = paste0(coefficient1," (",LCI1,",",UCI1,")")) %>% dplyr::select(variable,Coef1,pvalue,Coef2,pvalue1)
table2 <- table2 %>% mutate(variable = variable, coefficient = paste0(signif(coefficient,5),"(",signif(LCI,5),",",signif(UCI,5),")"), pvalue = pvalue)
## Warning: package 'bindrcpp' was built under R version 3.4.4
colnames(table2) <- c("Variable", "Unadjusted Coefficient (95% CI)","p-value","Adjusted Coefficient (95% CI)","p-value")
kable(table2)
| charlson |
544.88(370.32,719.43) |
370.32241 |
719.4312 |
< 0.0001 |
| racecat2 |
272.1(-226.58,770.77) |
-226.58128 |
770.7749 |
0.28414 |
| racecat3 |
-16.908(-655.29,621.47) |
-655.28737 |
621.4721 |
0.95851 |
| racecat4 |
-920.45(-1376.4,-464.44) |
-1376.44641 |
-464.4441 |
< 0.0001 |
| depcat2 |
909.12(36.133,1782.1) |
36.13322 |
1782.0980 |
0.041279 |
| depcat3 |
-282.31(-948.78,384.16) |
-948.78016 |
384.1570 |
0.40557 |
| ssi |
347.03(51.906,642.15) |
51.90568 |
642.1467 |
0.021292 |
| female1 |
313.07(-106.71,732.86) |
-106.71460 |
732.8575 |
0.14343 |
tab_df(table2, file = paste0(analysisDir,"Table2.doc"))
|
Variable
|
Unadjusted Coefficient (95% CI)
|
p-value
|
Adjusted Coefficient (95% CI)
|
p-value
|
|
charlson
|
544.88(370.32,719.43)
|
370.322411073949
|
719.431171800222
|
< 0.0001
|
|
racecat2
|
272.1(-226.58,770.77)
|
-226.58128068612
|
770.774913529939
|
0.28414
|
|
racecat3
|
-16.908(-655.29,621.47)
|
-655.287374936688
|
621.472095005024
|
0.95851
|
|
racecat4
|
-920.45(-1376.4,-464.44)
|
-1376.44640656538
|
-464.444067057424
|
< 0.0001
|
|
depcat2
|
909.12(36.133,1782.1)
|
36.1332221228739
|
1782.09804417907
|
0.041279
|
|
depcat3
|
-282.31(-948.78,384.16)
|
-948.780163442076
|
384.157045465912
|
0.40557
|
|
ssi
|
347.03(51.906,642.15)
|
51.9056756009429
|
642.146691484815
|
0.021292
|
|
female1
|
313.07(-106.71,732.86)
|
-106.714604524497
|
732.85747440742
|
0.14343
|
Effect modification
## Evaluate for effect modification by adding an interaction term
# cost <- cost %>% mutate(chdep = charlson*depcat)
# labelled::var_label(cost$chdep) <- 'Charlson*Depression Category Interaction Term' # label this new term
## Retrain the model with the new interaction term
LinEq <- paste0("outcosts2~",paste0(c(selectedVars), collapse = "+"),"+depcat*charlson")
LinEq
## [1] "outcosts2~charlson+racecat+depcat+ssi+female+depcat*charlson"
LinMod <- lm(as.formula(LinEq), data = cost)
summary(LinMod)
##
## Call:
## lm(formula = as.formula(LinEq), data = cost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3893.4 -1214.6 -463.0 666.4 9055.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1044.72 540.11 1.934 0.0537 .
## charlson 562.42 226.90 2.479 0.0136 *
## racecat2 258.06 254.16 1.015 0.3105
## racecat3 -28.67 327.69 -0.087 0.9303
## racecat4 -931.26 232.60 -4.004 7.32e-05 ***
## depcat2 673.12 533.43 1.262 0.2077
## depcat3 -239.17 388.77 -0.615 0.5387
## ssi 350.66 150.36 2.332 0.0201 *
## female1 305.93 213.83 1.431 0.1532
## charlson:depcat2 250.77 332.31 0.755 0.4509
## charlson:depcat3 -71.07 249.94 -0.284 0.7763
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1961 on 438 degrees of freedom
## Multiple R-squared: 0.2273, Adjusted R-squared: 0.2096
## F-statistic: 12.88 on 10 and 438 DF, p-value: < 2.2e-16
vif(LinMod)
## GVIF Df GVIF^(1/(2*Df))
## charlson 6.912099 1 2.629087
## racecat 1.180298 3 1.028013
## depcat 2.448953 2 1.250965
## ssi 1.269971 1 1.126930
## female 1.028575 1 1.014187
## charlson:depcat 10.274368 2 1.790353
## Now remove SSI
## Retrain the model with the new interaction term
LinEq <- paste0("outcosts2~",paste0(c(selectedVars[!selectedVars %in% c("ssi")]), collapse = "+"),"+depcat*charlson")
LinEq
## [1] "outcosts2~charlson+racecat+depcat+female+depcat*charlson"
LinMod <- lm(as.formula(LinEq), data = cost)
summary(LinMod)
##
## Call:
## lm(formula = as.formula(LinEq), data = cost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3463.0 -1232.2 -474.2 634.5 9339.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1856.86 414.93 4.475 9.74e-06 ***
## charlson 597.74 227.54 2.627 0.00892 **
## racecat2 311.60 254.39 1.225 0.22127
## racecat3 -14.05 329.28 -0.043 0.96598
## racecat4 -979.14 232.86 -4.205 3.17e-05 ***
## depcat2 606.81 535.36 1.133 0.25764
## depcat3 -501.71 373.99 -1.341 0.18046
## female1 316.39 214.86 1.473 0.14159
## charlson:depcat2 220.53 333.73 0.661 0.50909
## charlson:depcat3 -87.75 251.10 -0.349 0.72692
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1971 on 439 degrees of freedom
## Multiple R-squared: 0.2177, Adjusted R-squared: 0.2016
## F-statistic: 13.57 on 9 and 439 DF, p-value: < 2.2e-16
vif(LinMod)
## GVIF Df GVIF^(1/(2*Df))
## charlson 6.881309 1 2.623225
## racecat 1.150483 3 1.023639
## depcat 2.186324 2 1.215986
## female 1.028123 1 1.013964
## charlson:depcat 10.258419 2 1.789658
Session Info
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 leaps_3.0 car_3.0-0
## [4] carData_3.0-1 stargazer_5.2.1 sjPlot_2.4.1
## [7] tableone_0.9.3 ppcor_1.1 MASS_7.3-49
## [10] lmSupport_2.9.13 lsmeans_2.27-62 e1071_1.6-8
## [13] epiR_0.9-97 survival_2.41-3 moments_0.14
## [16] readxl_1.0.0 knitr_1.20 data.table_1.10.4-3
## [19] forcats_0.3.0 stringr_1.3.0 dplyr_0.7.4
## [22] purrr_0.2.4 readr_1.1.1 tidyr_0.8.0
## [25] tibble_1.4.2 ggplot2_2.2.1 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] backports_1.1.2 blme_1.0-4 VGAM_1.0-6
## [4] plyr_1.8.4 lazyeval_0.2.1 sp_1.2-7
## [7] TMB_1.7.13 splines_3.4.3 unmarked_0.12-2
## [10] TH.data_1.0-8 digest_0.6.15 htmltools_0.3.6
## [13] gdata_2.18.0 magrittr_1.5 AICcmodavg_2.1-1
## [16] openxlsx_4.0.17 modelr_0.1.1 sandwich_2.4-0
## [19] colorspace_1.3-2 rvest_0.3.2 BiasedUrn_1.07
## [22] haven_1.1.1 crayon_1.3.4 jsonlite_1.5
## [25] lme4_1.1-17 bindr_0.1.1 zoo_1.8-1
## [28] glue_1.2.0 gtable_0.2.0 emmeans_1.1.3
## [31] sjstats_0.14.2-3 sjmisc_2.7.1 abind_1.4-5
## [34] scales_0.5.0 mvtnorm_1.0-7 ggeffects_0.3.2
## [37] Rcpp_0.12.16 xtable_1.8-2 merTools_0.3.0
## [40] foreign_0.8-69 stats4_3.4.3 prediction_0.2.0
## [43] survey_3.33-2 DT_0.4 htmlwidgets_1.0
## [46] httr_1.3.1 gplots_3.0.1 modeltools_0.2-21
## [49] pkgconfig_2.0.1 reshape_0.8.7 nnet_7.3-12
## [52] utf8_1.1.3 tidyselect_0.2.4 labeling_0.3
## [55] rlang_0.2.0 reshape2_1.4.3 later_0.7.4
## [58] munsell_0.4.3 cellranger_1.1.0 tools_3.4.3
## [61] cli_1.0.0 sjlabelled_1.0.8 broom_0.4.4
## [64] ggridges_0.5.0 evaluate_0.10.1 arm_1.9-3
## [67] yaml_2.1.18 caTools_1.17.1 coin_1.2-2
## [70] nlme_3.1-137 mime_0.5 xml2_1.2.0
## [73] compiler_3.4.3 pbkrtest_0.4-7 bayesplot_1.5.0
## [76] rstudioapi_0.7 curl_3.2 stringi_1.1.7
## [79] highr_0.6 lattice_0.20-35 Matrix_1.2-13
## [82] psych_1.8.3.3 nloptr_1.0.4 effects_4.0-1
## [85] stringdist_0.9.4.7 pillar_1.2.1 pwr_1.2-2
## [88] lmtest_0.9-36 estimability_1.3 bitops_1.0-6
## [91] raster_2.6-7 httpuv_1.4.5 R6_2.2.2
## [94] promises_1.0.1 KernSmooth_2.23-15 rio_0.5.10
## [97] codetools_0.2-15 gtools_3.5.0 assertthat_0.2.0
## [100] rprojroot_1.3-2 mnormt_1.5-5 multcomp_1.4-8
## [103] parallel_3.4.3 hms_0.4.2 grid_3.4.3
## [106] labelled_1.0.1 coda_0.19-1 glmmTMB_0.2.0
## [109] class_7.3-14 minqa_1.2.4 rmarkdown_1.9
## [112] snakecase_0.9.1 gvlma_1.0.0.2 shiny_1.1.0
## [115] lubridate_1.7.3